
#################This code generates figure 3,4 and 5####################
library(RColorBrewer)
library(latex2exp)

set.seed(213)

#################Use different n#################
n_1 <- 100
n_2 <- 1000
n_3 <- 10000


#################Setting up for different q*, upper bound for the conditional probability of M_ij being observed defined in assumption 2#################
q_1 <- 0.75
q_2 <- 0.9
q_3 <- 0.99


#################Compute probability of all missing and expected proportion of missing
computation <- expand.grid(k = seq(1, 150, 1), n = c(n_1, n_2, n_3),
            q = c(q_1,q_2,q_3))
computation$p <- (1-computation$q^(computation$k))^(computation$n)
computation$y <- 1-computation$q^(computation$k) # expected proportion of missing (does not depend on n)

computation

#################Colors Vector#################
cols<-brewer.pal(n=3,name="Set1")
cols[1]
#################Quartz Code is used to save the exact figure in the Mac OS environment, for windows and other environments ignore this command##################
##################Figure 3##################
quartz(width=5,height=6)

par(mfrow=c(3,1))
##################First panel, n=100##################
plot(p~k,data=computation, subset=((n==100) & (q == q_1)), type="p",pch=20, col=cols[1],ylab="Probability",xlab="Number of variables",main="Lower bound on probability that all rows missing (n = 100)")
text(y=0.5,x=25,TeX('$q_*=0.75$'))
lines(p~k,data=computation, subset=((n==100) & (q == q_2)), type="p", pch=20,col=cols[2])
text(y=0.5,x=58,TeX('$q_*=0.90$'))
lines(p~k,data=computation, subset=((n==100) & (q == q_3)), type="p", pch=20,col=cols[3])
text(y=0.2,x=100,TeX('$q_*=0.99$'))


##################Second panel, n=1,000##################
plot(p~k,data=computation, subset=((n==1000) & (q == q_1)), type="p", pch=20,col=cols[1],ylab="Probability",xlab="Number of variables",main="Lower bound on probability that all rows missing (n = 1000)")
text(y=0.5,x=34,TeX('$q_*=0.75$'))

lines(p~k,data=computation, subset=((n==1000) & (q == q_2)), type="p",pch=20, col=cols[2])
text(y=0.5,x=78,TeX('$q_*=0.90$'))

lines(p~k,data=computation, subset=((n==1000) & (q == q_3)), type="p", pch=20,col=cols[3])
text(y=0.2,x=110,TeX('$q_*=0.99$'))

##################Third panel, n=10,000##################
plot(p~k,data=computation, subset=((n==10000) & (q == q_1)), type="p", pch=20,col=cols[1],ylab="Probability",xlab="Number of variables",main="Lower bound on probability that all rows missing (n = 10000)")
text(y=0.5,x=42,TeX('$q_*=0.75$'))

lines(p~k,data=computation, subset=((n==10000) & (q == q_2)), type="p", pch=20,col=cols[2])
text(y=0.5,x=100,TeX('$q_*=0.90$'))

lines(p~k,data=computation, subset=((n==10000) & (q == q_3)), type="p", pch=20,col=cols[3])
text(y=0.2,x=120,TeX('$q_*=0.99$'))

quartz.save("figure3.pdf", type = "pdf", device = dev.cur(), dpi = 600)



####

# Find n such that prob all units missing is .5 (or anything); solution when k allowed to be real-valued
# (1-q^k)^(n) = 0.5
# 1-q^k = 0.5^(1/n)
# q^k = 1-0.5^(1/n)
# k * log(q) = log( 1-0.5^(1/n))
# k = log( 1-0.5^(1/n))/log(q)

##################Computing the number of variables needed 
computation2 <- expand.grid(n = c(100:10000),
            q = c(q_1,q_2,q_3))

computation2$k50 <- floor(with(computation2, log( 1-0.5^(1/n))/log(q)))
computation2$k99 <- floor(with(computation2, log( 1-0.99^(1/n))/log(q)))

##################Displaying In-Text Numbers##################

##################n=10,000, q_* = 0.75##################
computation2[9901,]

##################n=10,000, q_* = 0.99##################
computation2[29703,]

##################Figure 4-1##################
quartz(width=6,height=6)

par(mfrow=c(1,1))
plot(k50~n,data=computation2,ylab="k",main=TeX("Largest k such that probability all data missing is $\\leq 0.50$"),col="white")

lines(k50~n,data=computation2, subset=((q == q_1)), type="p", pch=20,col=cols[1])
text(y=5,x=9500,TeX('$q_*=0.75$'))

lines(k50~n,data=computation2, subset=((q == q_2)), type="p", pch=15,col=cols[2])
text(y=120,x=6000,TeX('$q_*=0.90$'))


lines(k50~n,data=computation2, subset=((q == q_3)), type="p", pch=20,col=cols[3])
text(y=700,x=2000,TeX('$q_*=0.99$'))
quartz.save("figure4-1.pdf", type = "pdf", device = dev.cur(), dpi = 600)


##################Figure 4-2##################
quartz(width=6,height=6)

par(mfrow=c(1,1))
plot(k99~n,data=computation2,ylab="k",main=TeX("Largest k such that probability all data missing is $\\leq 0.99$"),col="white")
lines(k99~n,data=computation2, subset=((q == q_2)), type="p", pch=15,col=cols[2])
text(y=5,x=9500,TeX('$q_*=0.75$'))

lines(k99~n,data=computation2, subset=((q == q_1)), type="p", pch=20,col=cols[1])
text(y=180,x=6000,TeX('$q_*=0.90$'))

lines(k99~n,data=computation2, subset=((q == q_3)), type="p", pch=20,col=cols[3])
text(y=1100,x=2000,TeX('$q_*=0.99$'))

quartz.save("figure4-2.pdf", type = "pdf", device = dev.cur(), dpi = 600)



##################Figure 5##################
quartz(width=6,height=6)

par(mfrow=c(1,1))
plot(y~k,data=computation,subset=(n==100 & q == 0.75), type="p", ylab="Expected proportion missing", xlab="Number of Variables", ylim=c(0,1),pch=20,col = cols[1], main="Lower bound on expected proportion of data missing")
text(y=1,x=5,TeX('$q_*=0.75$'))
lines(y~k,data=computation,subset=(n==100 & q == 0.9),type="p",pch=20,col=cols[2])
text(y=0.9,x=33,TeX('$q_*=0.90$'))
lines(y~k,data=computation,subset=(n==100 & q == 0.99),type="p", pch=20,col=cols[3])
text(y=0.55,x=100,TeX('$q_*=0.99$'))

quartz.save("figure5.pdf", type = "pdf", device = dev.cur(), dpi = 600)

